%% Description
%{
ARB files can be exported in XML format.
 
information:
root=
<ARB_SEQ_EXPORT database="RHCyanobacteria1fm.arb" export_date="Wed Dec  3 12:23:47 2008">
root end=
</ARB_SEQ_EXPORT>

example species data=
<species name="UncSy654">
  <acc>AY125384</acc>
  <ALIGNMENT name="16s">
   <data>AGAGUU ... ACGG</data>
  </ALIGNMENT>
  <author>Zubkov M.V.; Fuchs B.M.; </author>
  <clone>A315024</clone>
  <date>2003-02-25;</date>
  <description>Uncultured Synechococcus sp. clone A315024 16S ribosomal RNA gene, partial sequence.</description>
  <full_name>uncultured Synechococcus sp.</full_name>
  <product>16S ribosomal RNA</product>
  <journal>Appl. Environ. Microbiol. 69:1299-1304 (2003)</journal>
  <title>High rate of uptake of organic nitrogen compounds by Prochlorococcus cyanobacteria as a key to their dominance in oligotrophic oceanic waters</title>
  <version>1</version>
  <nuc_region>1..1216</nuc_region>
  <nuc_rp>1-1216</nuc_rp>
  <submit_author>Zubkov M.V.; Fuchs B.M.; Tarran G.A.; Burkill P.H.; Amann R.; ; </submit_author>
  <submit_date>21-JUN-2002 Plymouth Marine Laboratory, PL1 3DH, Plymouth PL1 3DH, United Kingdom</submit_date>
  <align_bp_score_slv>115</align_bp_score_slv>
  <align_cutoff_head_slv>0</align_cutoff_head_slv>
  <align_cutoff_tail_slv>0</align_cutoff_tail_slv>
  <align_family_slv>UncSyn36:0.981758 Aq9A0y06:0.981758 UncSyn19:0.965174 UncSyn11:0.965174 Aq9A0yy5:0.965174 Acyyyy07:0.965174 SynSp160:0.955224 SynSp136:0.955224 UncSyn12:0.955224 SynSp164:0.955224 Aq9A0yyy:0.955224 SynSp162:0.955224 UncSyn30:0.955224 SynSp155:0.955224 SynSp140:0.955224 UncSyn35:0.955224 Acyyyy02:0.955224 UncSyn32:0.955224 Aq9A0y05:0.955224 Aq9A0yy4:0.955224 SynSp135:0.955224 SynSp143:0.955224 SynSp165:0.955224 SynSp145:0.955224 Acyyyy05:0.94859 UncSyn10:0.94859 SynSp107:0.945274 Aq9A0yy6:0.945274 Acyyyy08:0.945274 Aq9A0yy0:0.945274 UncSyn14:0.945274 UncSyn28:0.945274 UncSyn20:0.945274 Aq9A0yy2:0.945274 UncSyn34:0.945274 UncSyn26:0.93864 Acyyyy03:0.93864 Acyyy003:0.935323 UncSyn13:0.935323 Aq9A0y03:0.932007 </align_family_slv>
  <align_log_slv>copied identical ARB_C81988E9.1</align_log_slv>
  <align_quality_slv>100</align_quality_slv>
  <aligned_slv>2008-09-30 19:13:41</aligned_slv>
  <ambig_slv>0</ambig_slv>
  <ann_src_slv>EMBL; RDP;</ann_src_slv>
  <pintail_slv>100</pintail_slv>
  <homop_slv>0.41</homop_slv>
  <homop_events_slv>5</homop_events_slv>
  <nuc_gene_slv>1216</nuc_gene_slv>
  <publication_doi>10.1128/AEM.69.2.1299-1304.2003</publication_doi>
  <pubmed_id>12571062</pubmed_id>
  <seq_quality_slv>84</seq_quality_slv>
  <start>1</start>
  <stop>1216</stop>
  <tax_embl>Bacteria;Cyanobacteria;Chroococcales;Synechococcus;environmental samples;</tax_embl>
  <tax_embl_name>uncultured Synechococcus sp.</tax_embl_name>
  <tax_gg>Unclassified;</tax_gg>
  <tax_gg_name>uncultured Synechococcus sp.</tax_gg_name>
  <tax_rdp>Root;Bacteria;Cyanobacteria;Cyanobacteria;Family II;GpIIa;</tax_rdp>
  <tax_rdp_name>uncultured Synechococcus sp.</tax_rdp_name>
  <tax_xref_embl>154535</tax_xref_embl>
  <vector_slv>1.23</vector_slv>
  <nuc_term>1216</nuc_term>
  <tax_slv>Bacteria/Cyanobacteria/Chroococcales/Prochlorococcus et rel./Synechococcus sp.</tax_slv>
  <tmp> </tmp>
  <nuc>863</nuc>
 </species>

tree structure=
<ARB_SEQ_EXPORT database="RHCyanobacteria1fm.arb" export_date="Wed Dec  3 12:23:47 2008">
    <species name="species1">
        <acc>xx1</acc>
        <ALIGNMENT name="16s">
            <data>AGAGUU ... ACGG</data>
        </ALIGNMENT>
        <nuc>863</nuc>
    </species>
    <species name="species2">
        <acc>xx2</acc>
        <ALIGNMENT name="16s">
            <data>AGAGUU ... ACGG</data>
        </ALIGNMENT>
        <nuc>863</nuc>
    </species>
</ARB_SEQ_EXPORT>


The MATLAB function xmlread reads XML files.
 
The MATLAB functions regexp and regexpi search for strings within strings (e.g. a probe string in a sequence string).
%}

%% Clear workspace
% Start from a clean session: close the open figures, clear the workspace
% and clear the Command Window.
close('all')
clear('all')
clc


% begin first part


%% load XML file
%{
xmlread - Parse XML document and return Document Object Model node

Syntax
DOMnode = xmlread(filename)

Description
DOMnode = xmlread(filename) reads a URL or filename and returns a Document Object Model node
representing the parsed document. The filename input is a string enclosed in single quotes. The node can be 
manipulated by using standard DOM functions.

!! 
--> before loading, delete the following lines from the XML file:
<!DOCTYPE ARB_SEQ_EXPORT SYSTEM 'arb_seq_export.dtd' [
  <!ENTITY nbsp "&#160;">
  <!ENTITY acute "&#180;">
  <!ENTITY eacute "&#233;">
  <!ENTITY apostr "&#39;">
  <!ENTITY semi "&#59;"> ]>

--> and also delete this comment:
<!--There ... s a basic version of ARB_seq_export.dtd in /arb/software/arbmgg071207_32/lib/dtd
but you might need to expand it by yourself,
because the ARB-database may contain any kind of fields.-->

--> afterwards, search for "apostr" in the XML file and replace it with " ... "
!!

If the XML file has been read into a MATLAB variable,
MATLAB displays:
DOMnode =
 
[#document: null]

%DOMnode = xmlread('CyanobacAllfm.xml')
%}
%{
This function parses an XML file using methods of the DOM node returned
by xmlread, and stores the data it reads in the Name, Attributes, Data, and Children fields of a MATLAB structure:
function theStruct = parseXML(filename)

% PARSEXML Convert XML file to a MATLAB structure.

filename = 'CyanobacAllfm.xml';
try
   tree = xmlread(filename);
catch
   error('Failed to read XML file %s.',filename);
end

% Recurse over child nodes. This could run into problems with very deeply nested trees.
try
   arbTree = parseChildNodes(tree);
catch
   error('Unable to parse XML file %s.',filename);
end

% show loading is ready
strcat(filename,' is loaded (',arbTree.Name,')')
database = arbTree.Attributes(1).Value
export_date = arbTree.Attributes(2).Value


%% reduce data of export file
% only used childs (species)
species = arbTree.Children;
numberOfChildren = length(arbTree.Children);
species(1:2:numberOfChildren)=[];

% show number of species
numberOfSpezies = length(species)

% reduce primary data fields
for i=1:length(species)
    species(i).Name=species(i).Attributes.Value;
end

% remove empty primary data fields
species = rmfield(species,'Data');
species = rmfield(species,'Attributes');

% only used childs of each species
for i=1:numberOfSpezies
    species(i).Children(1:2:length(species(i).Children))=[];
end
%}
% copy the important fields into a new struct
%{
<acc>AY125384</acc>
<ALIGNMENT name="16s">
   <data>GUG..GG</data>
</ALIGNMENT>
<full_name>uncultured Synechococcus sp.</full_name>

for i=1:numberOfSpezies
    for j=1:length(species(i).Children)
        if(length(species(i).Children(j).Name) == 3)  
            if(species(i).Children(j).Name == 'acc')
                newSpecies(i).Children(1).Name = species(i).Children(j).Name;
                newSpecies(i).Children(1).Data = cellstr(species(i).Children(j).Children(1).Data);
            end
        end
        
        if(length(species(i).Children(j).Name) == 9)
            if(species(i).Children(j).Name == 'ALIGNMENT')
                newSpecies(i).Children(2).Name = 'sequence';
                newSpecies(i).Children(2).Data = species(i).Children(j).Children(2).Children.Data;
            end
        end
        
        if(length(species(i).Children(j).Name) == 9)
            if(species(i).Children(j).Name == 'full_name')
                newSpecies(i).Children(3).Name = species(i).Children(j).Name;
                newSpecies(i).Children(3).Data = cellstr(species(i).Children(j).Children.Data);
            end
        end
    end
end

%}
% END of first part

%% load created mat data


% Loads every variable stored in ARBtree.mat into the workspace.
% NOTE(review): the active code below reads newSpecies, primerF and
% primerRrc without defining them, so they presumably come from this file
% - confirm that ARBtree.mat really contains all three.
load('ARBtree.mat')
% structs and variables
%{
structs:            tree
                    arbTree
                    species
                    newSpecies

string variables:   primerF
                    primerR
                    probe_sequence
%}


%% show distribution of sequence length
% sequenceLength(i) is the number of characters stored in the sequence field
% (Children(2).Data) of species i. The vector is reused further down when the
% weight-based fragment length is compared with the real length.
sequenceLength = zeros(1,length(newSpecies));   % preallocate (row vector)
for i=1:length(newSpecies)
    sequenceLength(i) = length(newSpecies(i).Children(2).Data);
end
    
figure
% The number of bins equals the spread of the lengths. It is clamped to at
% least one bin, because max - min is 0 when all sequences have the same
% length and hist cannot work with zero bins.
histSteps = max(1,max(sequenceLength)-min(sequenceLength));
hist(sequenceLength,histSteps)
%xlabel('Sequence length [bp]')
%ylabel('Hits')
title('Sequence length distribution')


%% search probe string in sequence string
%{
%struct of read xml
% all species
%newSpecies(1) - newSpecies(numberOfSpecies)

% all info fields
%"acc"
newSpecies(i).Children(1).Name = 'acc'
newSpecies(i).Children(1).Data

%"sequence"
newSpecies(i).Children(2).Name = 'sequence'
newSpecies(i).Children(2).Data

%"full_name"
newSpecies(i).Children(3).Name = 'full_name'
newSpecies(i).Children(3).Data
%}

%{
regexp, regexpi Match regular expression

Syntax
regexp('str', 'expr')
[start_idx, end_idx, extents, matches, tokens, names, splits] = regexp('str', 'expr')
[v1, v2, ...] = regexp('str', 'expr', q1,q2, ...)
[v1 v2 ...] = regexp('str', 'expr', ..., options)

Each of these syntaxes apply to both regexp and regexpi.
The regexp function is case sensitive in matching regular expressions to a string, and regexpi is case 
insensitive. 

Description
The following descriptions apply to both regexp and regexpi:
regexp('str', 'expr') returns a row vector containing the starting index of each substring of str that matches 
the regular expression string expr. If no matches are found, regexp returns an empty array. The str and expr 
arguments can also be cell arrays of strings. See Regular Expressions in the MATLAB Programming documentation 
for more information.
%}


%%%%%%%%%%%%%%%
% match AluI sequence %%%%%
% AluI (AGCT) T=U
probe_sequence = 'AGCU';

% Preallocate so that every species owns an entry/row even when its sequence
% contains no recognition site. Without this, a trailing species without a
% site never gets a row in cuts and the later cuts(i,1) lookup fails.
% A 0 in cuts means "no (further) cut site"; the sections below test for it.
numberOfCuts = zeros(1,length(newSpecies));
cuts = zeros(length(newSpecies),1);

for i=1:length(newSpecies)
    % start index of every case-insensitive occurrence of the site
    matchIndex = regexpi(newSpecies(i).Children(2).Data,probe_sequence);
    % shift by one position - presumably because AluI cuts after the second
    % base of the site (AG^CU), so start+1 is the last base left of the cut
    matchIndex = matchIndex + 1;
    numberOfCuts(i) = length(matchIndex);
    % rows are zero-padded up to the largest number of cuts seen so far
    cuts(i,1:numberOfCuts(i)) = matchIndex;
end

% Forward fragment of a full digest: the first cut position of a species,
% or the whole sequence length when the species has no cut site
% (first column of cuts is 0).
for k=1:length(newSpecies)
    firstCut = cuts(k,1);
    if(firstCut ~= 0)
        fullDigestFragments(k) = firstCut;
    else
        fullDigestFragments(k) = length(newSpecies(k).Children(2).Data);
    end
end

% t-RFLP pattern forward: histogram with unit-spaced bin centers spanning
% the observed fragment lengths
figure
forwardBinCenters = min(fullDigestFragments):max(fullDigestFragments);
hist(fullDigestFragments,forwardBinCenters)
title('Distribution of forward fragments of full digest')


%%%%%%%%%%%%%%%
% calculate reverse fragments %%%%%
% for each species: full length of sequence between both primers subtracted
% by each cut site

% Preallocate one row per species so that species without any cut site
% (all-zero row in cuts) still have a row; 0 means "no fragment".
tRFLPreverseFragments = zeros(length(newSpecies),size(cuts,2));
for i=1:size(cuts,1)
    hasCut = cuts(i,:) ~= 0;    % zeros in cuts are padding, not cut sites
    tRFLPreverseFragments(i,hasCut) = length(newSpecies(i).Children(2).Data) - cuts(i,hasCut);
end

% reverse fragments of full digest: the fragment behind the LAST cut site,
% or the whole sequence length when the species has no cut site
fullDigestFragmentsReverse = zeros(1,length(newSpecies));
for i=1:length(newSpecies)
    lastCut = numberOfCuts(i);
    % lastCut == 0 must be tested first: indexing column 0 is an error, and
    % || short-circuits so the lookup is skipped for species without cuts
    if(lastCut == 0 || tRFLPreverseFragments(i,lastCut) == 0)
        fullDigestFragmentsReverse(i) = length(newSpecies(i).Children(2).Data);
    else
        fullDigestFragmentsReverse(i) = tRFLPreverseFragments(i,lastCut);
    end
end

% t-RFLP pattern reverse: histogram with unit-spaced bin centers spanning
% the observed reverse fragment lengths
figure
reverseBinCenters = min(fullDigestFragmentsReverse):max(fullDigestFragmentsReverse);
hist(fullDigestFragmentsReverse,reverseBinCenters)
title('Distribution of reverse fragments of full digest')


%%%%%%%%%%%%%%%
% t-RFLP pattern of both
% column 1 = forward fragments, column 2 = reverse fragments
tRFLPfragments(1:length(fullDigestFragments),1) = fullDigestFragments(:);
tRFLPfragments(1:length(fullDigestFragmentsReverse),2) = fullDigestFragmentsReverse(:);
figure
hist(tRFLPfragments,min(tRFLPfragments(:)):max(tRFLPfragments(:)))
title('Distribution of fragments of full digest')

% number of cut sites per species
figure
hist(numberOfCuts,min(numberOfCuts(:)):max(numberOfCuts(:)))
title('Distribution of amount of cut sites')


%%%%%%%%%%%%%%%
% control %%%%%
% all forward and reverse primers should match
% The two ratios printed below are the fraction of species whose sequence
% contains the primer at least once.

% forward primer ='primerF'
numberOfForwardMatches = 0;
matchForward = zeros(length(newSpecies),0);     % one row per species
for i=1:length(newSpecies)
    % end index of every case-insensitive occurrence of the forward primer
    matchIndex = regexpi(newSpecies(i).Children(2).Data,primerF,'end');
    matchForward(i,1:length(matchIndex)) = matchIndex;
    % regexpi returns an EMPTY array when nothing matches (never NaN), so
    % the match test has to be isempty; isnan counted every species as a hit
    if(~isempty(matchIndex))
        numberOfForwardMatches = numberOfForwardMatches + 1;
    end
end
forwardMatch = numberOfForwardMatches/length(newSpecies)

% reverse primer ='primerR'
% NOTE(review): primerRrc is not defined in this script; presumably it is the
% reverse complement of primerR and is stored in ARBtree.mat - confirm.
%rcString = MakeRCString(primerR);
numberOfReverseMatches = 0;
matchReverse = zeros(length(newSpecies),0);     % one row per species
for i=1:length(newSpecies)
    % start index of every case-insensitive occurrence of the reverse primer
    matchIndex = regexpi(newSpecies(i).Children(2).Data,primerRrc,'start');
    matchReverse(i,1:length(matchIndex)) = matchIndex;
    if(~isempty(matchIndex))
        numberOfReverseMatches = numberOfReverseMatches + 1;
    end
end
reverseMatch = numberOfReverseMatches/length(newSpecies)


%% calculate GC-Content
% formula for the weight:
% MW = [n(AMP)x 312.19 g/mol] + [n(CMP)x 288.17 g/mol]
%      + [n(GMP)x 328.19 g/mol] + [n(TMP)x 303.18 g/mol]
%      + 18.02 g/mol

% Count the nucleotides of every species (case-insensitive).
% numberOfNucleotides columns: 1 = a, 2 = g, 3 = c, 4 = u, 5 = GC content [%]
baseLetters = {'a','g','c','u'};
for s=1:length(newSpecies)
    currentSequence = newSpecies(s).Children(2).Data;
    for b=1:4
        numberOfNucleotides(s,b) = length(regexpi(currentSequence,baseLetters{b}));
    end
    % GC content [%]: (g + c) relative to all counted a/g/c/u
    gcCount = numberOfNucleotides(s,2) + numberOfNucleotides(s,3);
    numberOfNucleotides(s,5) = 100*gcCount/sum(numberOfNucleotides(s,1:4));
end

figure
gcPercent = numberOfNucleotides(:,5);
hist(gcPercent,min(gcPercent):max(gcPercent))
title('Distribution of GC content [%]')

% molecular weight
% weights columns 1-4: nucleotide count times weight per nucleotide [g/mol],
% in the column order of numberOfNucleotides (a, g, c, u):
%   AMP 312.19, GMP 328.19, CMP 288.17, TMP 303.18
% NOTE(review): column 4 of numberOfNucleotides counts 'u', while 303.18
% g/mol is labelled TMP - confirm this is the intended weight.
weights = bsxfun(@times,numberOfNucleotides(:,1:4),[312.19 328.19 288.17 303.18]);

% column 5: total [g/mol], including the constant 18.02 g/mol term
weights(:,5) = sum(weights(:,1:4),2) + 18.02;

% column 6: relative fragment length = total weight divided by the mean
% weight of the four nucleotides
ssnucleotideWeight = (312.19 + 328.19 + 288.17 + 303.18)/4;
weights(:,6) = weights(:,5) ./ ssnucleotideWeight;

% column 7: difference between relative fragment length and real length
weights(:,7) = weights(:,6) - sequenceLength';

figure
lengthDifference = weights(:,7);
hist(lengthDifference,min(lengthDifference):max(lengthDifference))
title('Distribution of difference between fragment lengths')

% column 8: share of the G and C weights in the total weight, in percent
% NOTE(review): this is a ratio of weights, i.e. a mass fraction, although
% the plot title says mol% - confirm which quantity is intended.
weights(:,8) = 100*sum(weights(:,2:3),2) ./ weights(:,5);

figure
gcWeightShare = weights(:,8);
hist(gcWeightShare,min(gcWeightShare):max(gcWeightShare))
title('Distribution of GC content [mol%]')


%% export to Excel
% Writes one worksheet per result into speciesSpecificFragments.xls.
% Every sheet starts in row 2 with the first data field of the species
% (Children(1), the 'acc' field) in column A.
xlsFile = 'speciesSpecificFragments.xls';

% collect the per-species columns: Children(1) data, Children(3) data
% ('full_name') and the sequence length
for s=1:length(newSpecies)
    speciesNames(s) = newSpecies(s).Children(1).Data;
    fullSpeciesNames(s) = newSpecies(s).Children(3).Data;
    speciesSequenceLength(s) = length(newSpecies(s).Children(2).Data);
end
nameColumn = speciesNames';
fullNameColumn = fullSpeciesNames';

% sheet 'ff': all forward cut positions
%xlswrite('speciesSpecificFragments.xls',header,'ff','A1')
xlswrite(xlsFile,nameColumn,'ff','A2')
xlswrite(xlsFile,cuts,'ff','B2')

% sheet 'fulldigestFF': forward fragment of the full digest
xlswrite(xlsFile,nameColumn,'fulldigestFF','A2')
xlswrite(xlsFile,fullDigestFragments','fulldigestFF','B2')
xlswrite(xlsFile,fullNameColumn,'fulldigestFF','C2')

% sheet 'rf': all reverse fragments
xlswrite(xlsFile,nameColumn,'rf','A2')
xlswrite(xlsFile,tRFLPreverseFragments,'rf','B2')

% sheet 'fulldigestRF': reverse fragment of the full digest
xlswrite(xlsFile,nameColumn,'fulldigestRF','A2')
xlswrite(xlsFile,fullDigestFragmentsReverse','fulldigestRF','B2')
xlswrite(xlsFile,fullNameColumn,'fulldigestRF','C2')

% sheet 'length': full length of sequence
xlswrite(xlsFile,nameColumn,'length','A2')
xlswrite(xlsFile,speciesSequenceLength','length','B2')
xlswrite(xlsFile,fullNameColumn,'length','C2')

% sheet 'cleavingSites': number of cuts
xlswrite(xlsFile,nameColumn,'cleavingSites','A2')
xlswrite(xlsFile,numberOfCuts','cleavingSites','B2')
xlswrite(xlsFile,fullNameColumn,'cleavingSites','C2')


